{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Manifold Learning - How to improve Classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Unsupervised learning can be useful in the supervised setting as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Automatically created module for IPython interactive environment\n"
     ]
    }
   ],
   "source": [
    "print(__doc__)\n",
    "\n",
    "import random\n",
    "import numpy as np\n",
    "#import matplotlib.pyplot as plt\n",
    "#from mpl_toolkits.mplot3d import Axes3D\n",
    "#from matplotlib.ticker import NullFormatter\n",
    "#%matplotlib inline\n",
    "from sklearn import manifold, datasets\n",
    "from sklearn import random_projection\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "from sklearn import neighbors, linear_model\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Toy problem - Classify digits using KNN and Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset dimensions: (1797, 64)\n"
     ]
    }
   ],
   "source": [
    "# Load the scikit-learn digits data (all ten classes) and keep the feature\n",
    "# matrix and the label vector under their own names.\n",
    "digits = datasets.load_digits(n_class=10)\n",
    "X_digits, y_digits = digits.data, digits.target\n",
    "n_samples, n_features = X_digits.shape\n",
    "print('Dataset dimensions:', X_digits.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN score: 0.977747\n",
      "LogisticRegression score: 0.947149\n"
     ]
    }
   ],
   "source": [
    "# Stratified split: 40% of the samples are held out for testing.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X_digits,\n",
    "    y_digits,\n",
    "    test_size=0.4,\n",
    "    random_state=42,\n",
    "    stratify=y_digits,\n",
    ")\n",
    "\n",
    "# Baseline classifiers, trained on the original features only\n",
    "knn = neighbors.KNeighborsClassifier(n_neighbors=5)\n",
    "logistic = linear_model.LogisticRegression()\n",
    "\n",
    "knn_baseline = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('KNN score: %f' % knn_baseline)\n",
    "logistic_baseline = logistic.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('LogisticRegression score: %f' % logistic_baseline)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test set dimensions: (719, 64)\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: shape of the held-out split\n",
    "print('Test set dimensions:',X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of KNN errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Misclassified test samples = test-set size minus (size * accuracy),\n",
    "# rounded to zero decimals.\n",
    "n_test = X_test.shape[0]\n",
    "knn_accuracy = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "round(n_test - n_test * knn_accuracy, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Automated feature engineering\n",
    "\n",
    "## Try out different manifold-learned features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_additional_features = 20 # for all methods but tSNE\n",
    "\n",
    "# Alternative transformers -- uncomment exactly one assignment to try it.\n",
    "#transformer = random_projection.SparseRandomProjection(n_additional_features, random_state=42)\n",
    "#transformer = random_projection.GaussianRandomProjection(n_additional_features, random_state=0)\n",
    "#transformer = manifold.MDS(n_additional_features, max_iter=300, n_init=3) # works with 20 add feat\n",
    "#transformer = manifold.TSNE(n_components=3, init='pca', random_state=42)\n",
    "\n",
    "# Active choice: t-SNE with random initialisation. init is spelled out because the\n",
    "# default is not the same across scikit-learn releases (newer ones default to 'pca').\n",
    "transformer = manifold.TSNE(n_components=3, perplexity=15, init='random', random_state=42)\n",
    "#transformer = PCA(n_additional_features) # works with 13 or 20 add feat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learn the new features on the training data and append them to the originals\n",
    "New_features_train = transformer.fit_transform(X_train)\n",
    "X_train2 = np.c_[X_train, New_features_train]\n",
    "#X_train2 = New_features_train\n",
    "\n",
    "if hasattr(transformer, 'transform'):\n",
    "    # The fitted transformer can map unseen samples itself (e.g. PCA or the\n",
    "    # random projections), so use its own mapping for the test data.\n",
    "    New_features_test = transformer.transform(X_test)\n",
    "else:\n",
    "    # Embeddings such as t-SNE only offer fit_transform() and cannot embed new\n",
    "    # samples. Approximate the embedding by hand with the least-squares linear\n",
    "    # map (no intercept) from the training inputs to their embedded coordinates.\n",
    "    Handmade_fit_transform = np.linalg.lstsq(X_train, New_features_train, rcond=None)\n",
    "    Transformation_matrix = Handmade_fit_transform[0]\n",
    "    New_features_test = np.matmul(X_test, Transformation_matrix)\n",
    "\n",
    "# Original test features plus the new ones\n",
    "X_test2 = np.c_[X_test, New_features_test]\n",
    "#X_test2 = New_features_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test error comparison without/with additional \"manifold\" features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN test score pure    : 0.977747\n",
      "KNN test score manifold: 0.981919\n",
      "LogisticRegression test score pure    : 0.947149\n",
      "LogisticRegression test score manifold: 0.926287\n"
     ]
    }
   ],
   "source": [
    "# Test accuracy of both classifiers: original features vs. original + manifold features.\n",
    "# Each classifier is refitted right before it is scored.\n",
    "score_knn_pure = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('KNN test score pure    : %f' % score_knn_pure)\n",
    "score_knn_manifold = knn.fit(X_train2, y_train).score(X_test2, y_test)\n",
    "print('KNN test score manifold: %f' % score_knn_manifold)\n",
    "\n",
    "score_logistic_pure = logistic.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('LogisticRegression test score pure    : %f' % score_logistic_pure)\n",
    "score_logistic_manifold = logistic.fit(X_train2, y_train).score(X_test2, y_test)\n",
    "print('LogisticRegression test score manifold: %f' % score_logistic_manifold)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training error comparison without/with additional \"manifold\" features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN train score pure    : 0.987941\n",
      "KNN train score manifold: 0.989796\n",
      "LogisticRegression train score pure    : 0.999072\n",
      "LogisticRegression train score manifold: 1.000000\n"
     ]
    }
   ],
   "source": [
    "# Training accuracy (scored on the data each model was fitted on):\n",
    "# original features vs. original + manifold features.\n",
    "train_knn_pure = knn.fit(X_train, y_train).score(X_train, y_train)\n",
    "print('KNN train score pure    : %f' % train_knn_pure)\n",
    "train_knn_manifold = knn.fit(X_train2, y_train).score(X_train2, y_train)\n",
    "print('KNN train score manifold: %f' % train_knn_manifold)\n",
    "\n",
    "train_logistic_pure = logistic.fit(X_train, y_train).score(X_train, y_train)\n",
    "print('LogisticRegression train score pure    : %f' % train_logistic_pure)\n",
    "train_logistic_manifold = logistic.fit(X_train2, y_train).score(X_train2, y_train)\n",
    "print('LogisticRegression train score manifold: %f' % train_logistic_manifold)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check against random feature vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "nof_random_features = 60\n",
    "\n",
    "# Seeded generator so the control experiment draws the same noise columns on\n",
    "# every run; the global np.random state is not touched.\n",
    "noise_rng = np.random.RandomState(42)\n",
    "\n",
    "# Append columns of integer noise drawn uniformly from 0..15 (high is exclusive)\n",
    "X_train3 = np.c_[X_train, noise_rng.randint(low=0, high=16, size=[X_train.shape[0], nof_random_features])]\n",
    "X_test3 = np.c_[X_test, noise_rng.randint(low=0, high=16, size=[X_test.shape[0], nof_random_features])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN test score pure  : 0.977747\n",
      "KNN test score random: 0.951321\n",
      "LogisticRegression test score pure   : 0.947149\n",
      "LogisticRegression test score random: 0.922114\n"
     ]
    }
   ],
   "source": [
    "# Control experiment: original features vs. original + random noise features.\n",
    "# Each classifier is refitted right before it is scored.\n",
    "score_knn_pure = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('KNN test score pure  : %f' % score_knn_pure)\n",
    "score_knn_random = knn.fit(X_train3, y_train).score(X_test3, y_test)\n",
    "print('KNN test score random: %f' % score_knn_random)\n",
    "\n",
    "score_logistic_pure = logistic.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('LogisticRegression test score pure   : %f' % score_logistic_pure)\n",
    "score_logistic_random = logistic.fit(X_train3, y_train).score(X_test3, y_test)\n",
    "print('LogisticRegression test score random: %f' % score_logistic_random)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Just because we can do it - putting everything together"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is not always a good idea: too many features can make the problem harder for the algorithms."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One instance of every transformer; all but t-SNE produce nof_additional_features columns\n",
    "nof_additional_features = 50\n",
    "transformer1 = random_projection.SparseRandomProjection(nof_additional_features, random_state=42)\n",
    "transformer2 = random_projection.GaussianRandomProjection(nof_additional_features, random_state=0)\n",
    "transformer3 = manifold.MDS(nof_additional_features, max_iter=300, n_init=3,random_state=11) # works with 20 add feat\n",
    "# init is pinned to 'random' because the t-SNE default is not the same across\n",
    "# scikit-learn releases (newer ones default to 'pca')\n",
    "transformer4 = manifold.TSNE(n_components=3, perplexity=15, init='random', random_state=42)\n",
    "transformer5 = PCA(nof_additional_features) # works with 13/20 add feat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedders = (transformer1, transformer2, transformer3, transformer4, transformer5)\n",
    "\n",
    "# Embed the training data with every transformer, in the order 1..5\n",
    "Nft1, Nft2, Nft3, Nft4, Nft5 = [emb.fit_transform(X_train) for emb in embedders]\n",
    "\n",
    "# Carry each embedding over to the test data by hand: solve the least-squares\n",
    "# linear map from X_train to the embedded training features, then apply it to X_test.\n",
    "# Each HftK holds the full lstsq result tuple; element 0 is the coefficient matrix.\n",
    "Hft1, Hft2, Hft3, Hft4, Hft5 = [\n",
    "    np.linalg.lstsq(X_train, embedded, rcond=None)\n",
    "    for embedded in (Nft1, Nft2, Nft3, Nft4, Nft5)\n",
    "]\n",
    "Mat1, Mat2, Mat3, Mat4, Mat5 = [\n",
    "    np.matmul(X_test, solution[0])\n",
    "    for solution in (Hft1, Hft2, Hft3, Hft4, Hft5)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the combined feature matrices column-wise.\n",
    "# Other combinations that can be swapped in:\n",
    "#X_train4 = np.c_[Nft1,Nft2,Nft3,Nft4,Nft5]\n",
    "#X_test4 = np.c_[Mat1,Mat2,Mat3,Mat4,Mat5]\n",
    "#X_train4 = np.c_[X_train,Nft1,Nft2,Nft3,Nft4,Nft5]\n",
    "#X_test4 = np.c_[X_test,Mat1,Mat2,Mat3,Mat4,Mat5]\n",
    "#X_train4 = np.c_[Nft3,Nft5] # try this with 50 additional features\n",
    "#X_test4 = np.c_[Mat3,Mat5]\n",
    "\n",
    "# Active combination: original features plus the features from transformers 3, 4 and 5\n",
    "# (try this with 50 additional features). Indexing np.c_ with a tuple is the same\n",
    "# as listing its members between the brackets.\n",
    "train_parts = (X_train, Nft3, Nft4, Nft5)\n",
    "test_parts = (X_test, Mat3, Mat4, Mat5)\n",
    "X_train4 = np.c_[train_parts]\n",
    "X_test4 = np.c_[test_parts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN test score pure: 0.977747\n",
      "KNN test score all : 0.983310\n",
      "LogisticRegression test score pure: 0.947149\n",
      "LogisticRegression test score all : 0.944367\n"
     ]
    }
   ],
   "source": [
    "# Test accuracy: original features vs. the combined feature matrix.\n",
    "# Each classifier is refitted right before it is scored.\n",
    "score_knn_pure = knn.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('KNN test score pure: %f' % score_knn_pure)\n",
    "score_knn_all = knn.fit(X_train4, y_train).score(X_test4, y_test)\n",
    "print('KNN test score all : %f' % score_knn_all)\n",
    "\n",
    "score_logistic_pure = logistic.fit(X_train, y_train).score(X_test, y_test)\n",
    "print('LogisticRegression test score pure: %f' % score_logistic_pure)\n",
    "score_logistic_all = logistic.fit(X_train4, y_train).score(X_test4, y_test)\n",
    "print('LogisticRegression test score all : %f' % score_logistic_all)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of KNN errors now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12.0"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Misclassified test samples with the combined features:\n",
    "# test-set size minus (size * accuracy), rounded to zero decimals.\n",
    "n_test = X_test.shape[0]\n",
    "knn_accuracy_all = knn.fit(X_train4, y_train).score(X_test4, y_test)\n",
    "round(n_test - n_test * knn_accuracy_all, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
